#ifndef _BMP_H_
#define _BMP_H_

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/**
 * Decode a BMP image that is held completely in memory.
 *
 * The definition is only compiled when BMP_IMPLEMENTATION is defined before
 * this header is included.
 *
 * @param buf  start of the BMP file data (must begin with the "BM" magic)
 * @param len  number of bytes available at buf
 * @param w    receives the image width in pixels
 * @param h    receives the image height in pixels
 * @return     heap-allocated array of (*w) * (*h) 32-bit pixels that the
 *             caller releases with free(), or NULL when an argument is NULL,
 *             the data is too short, the magic is missing, the format is not
 *             supported or the allocation fails
 */
uint32_t *bmp_load(uint8_t *buf, uint64_t len, int *w, int *h);

#ifdef BMP_IMPLEMENTATION

/*
 * Packed layout of the start of a BMP file: the 14-byte file header followed
 * by the information header, which is either the Windows variant (win) or the
 * shorter OS/2 core variant (os2). Both variants start with their own size,
 * so info.win.biSize and info.os2.bcSize alias the same four bytes.
 */
typedef struct {
  uint8_t  bfType[2];               /* magic number "BM" */
  uint32_t bfSize;                  /* file size */
  uint16_t bfReserved1;
  uint16_t bfReserved2;
  uint32_t bfOffBits;               /* offset to image data */
  union {
      struct {
          uint32_t biSize;          /* size of bitmap info header */
          int32_t  biWidth;         /* image width */
          int32_t  biHeight;        /* image height */
          uint16_t biPlanes;        /* only 1 plane supported */
          uint16_t biBitCount;      /* bits per pixel */
          uint32_t biCompression;   /* compression type (BI_* constant) */
          uint32_t biSizeImage;     /* size of pixel data */
          uint32_t biXPelsPerMeter; /* pixels per meter on x-axis */
          uint32_t biYPelsPerMeter; /* pixels per meter on y-axis */
          uint32_t biClrUsed;       /* number of used colors */
          uint32_t biClrImportant;  /* number of important colors */
      } win;
      struct {
          uint32_t bcSize;          /* size of bitmap core header */
          uint16_t bcWidth;         /* image width */
          uint16_t bcHeight;        /* image height */
          uint16_t bcPlanes;        /* must be equal to 1 */
          uint16_t bcBitCount;      /* bits per pixel */
      } os2;
  } info;
} __attribute__((packed)) bmp_header_t; /* packed: no padding between fields */
/* Values of the biCompression header field. */
enum { BI_RGB, BI_RLE8, BI_RLE4, BI_BITFIELDS };

/* Read an unsigned little-endian 16-bit value. */
static uint32_t bmp_rd16(const uint8_t *p)
{
    return (uint32_t)p[0] | ((uint32_t)p[1] << 8);
}

/* Read an unsigned little-endian 32-bit value. */
static uint32_t bmp_rd32(const uint8_t *p)
{
    return (uint32_t)p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

/*
 * Look up palette entry idx (stored blue, green, red, each entry nc bytes long)
 * and return it as an opaque 0xAABBGGRR pixel. Entries that would lie outside
 * the input buffer (idx >= pal_n) yield opaque black instead of reading out of
 * bounds.
 */
static uint32_t bmp_pal(const uint8_t *pal, size_t pal_n, size_t nc, uint32_t idx)
{
    const uint8_t *e;

    if(idx >= pal_n) return 0xff000000u;
    e = pal + (size_t)idx * nc;
    return 0xff000000u | ((uint32_t)e[0] << 16) | ((uint32_t)e[1] << 8) | (uint32_t)e[2];
}

/*
 * Decode uncompressed (BI_RGB) pixel data. The caller guarantees that src holds
 * h rows of stride bytes each. Source rows are stored bottom-up, so source row
 * y is written to output row h - 1 - y.
 */
static void bmp_decode_rgb(const uint8_t *src, size_t stride, uint32_t *out, size_t w, size_t h,
    uint32_t bc, const uint8_t *pal, size_t pal_n, size_t nc)
{
    const uint8_t *row, *p;
    uint32_t *dst, v;
    size_t x, y;

    for(y = 0; y < h; y++) {
        row = src + y * stride;
        dst = out + (h - 1 - y) * w;
        for(x = 0; x < w; x++) {
            switch(bc) {
                case 1:     /* eight pixels per byte, most significant bit first */
                    v = (uint32_t)(row[x >> 3] >> (7 - (x & 7))) & 1;
                    dst[x] = bmp_pal(pal, pal_n, nc, v);
                break;
                case 4:     /* two pixels per byte, high nibble first */
                    v = row[x >> 1];
                    dst[x] = bmp_pal(pal, pal_n, nc, (x & 1) ? (v & 0xf) : (v >> 4));
                break;
                case 8:
                    dst[x] = bmp_pal(pal, pal_n, nc, row[x]);
                break;
                case 15:    /* 5-5-5, blue in the low bits */
                    v = bmp_rd16(row + x * 2);
                    dst[x] = 0xff000000u | (((v >> 10) & 31) * 255 / 31) |
                        ((((v >> 5) & 31) * 255 / 31) << 8) |
                        (((v & 31) * 255 / 31) << 16);
                break;
                case 16:
                    /* NOTE(review): decoded as 5-6-5 exactly as this loader always did;
                     * the BITMAPINFOHEADER documentation describes 16-bit BI_RGB as
                     * 5-5-5 - confirm which kind of file callers rely on. */
                    v = bmp_rd16(row + x * 2);
                    dst[x] = 0xff000000u | (((v >> 11) & 31) * 255 / 31) |
                        ((((v >> 5) & 63) * 255 / 63) << 8) |
                        (((v & 31) * 255 / 31) << 16);
                break;
                case 24:    /* stored blue, green, red */
                    p = row + x * 3;
                    dst[x] = 0xff000000u | ((uint32_t)p[0] << 16) | ((uint32_t)p[1] << 8) | (uint32_t)p[2];
                break;
                default:    /* 32: stored blue, green, red, alpha */
                    p = row + x * 4;
                    dst[x] = ((uint32_t)p[3] << 24) | ((uint32_t)p[0] << 16) | ((uint32_t)p[1] << 8) | (uint32_t)p[2];
                break;
            }
        }
    }
}

/*
 * Decode BI_RLE8 (nibbles == 0) or BI_RLE4 (nibbles != 0) pixel data of at most
 * avail bytes. Decoding stops at the end-of-bitmap code, when the data runs out
 * or when the position moves above the last row; pixels that are never written
 * keep the value they had in out. Pixels beyond the right edge are dropped.
 */
static void bmp_decode_rle(const uint8_t *src, size_t avail, uint32_t *out, size_t w, size_t h,
    int nibbles, const uint8_t *pal, size_t pal_n, size_t nc)
{
    size_t pos = 0, x = 0, y = 0, i, nbytes;
    uint32_t count, val, idx, *row;

    while(y < h && pos < avail && avail - pos >= 2) {
        row = out + (h - 1 - y) * w;    /* rows are stored bottom-up */
        count = src[pos];
        val = src[pos + 1];
        pos += 2;
        if(count) {
            /* encoded run: count pixels, alternating both nibbles of val for RLE4 */
            for(i = 0; i < count; i++, x++) {
                idx = !nibbles ? val : (i & 1) ? (val & 0xf) : (val >> 4);
                if(x < w) row[x] = bmp_pal(pal, pal_n, nc, idx);
            }
        } else if(val == 0) {
            /* end of line */
            x = 0;
            y++;
        } else if(val == 1) {
            /* end of bitmap */
            return;
        } else if(val == 2) {
            /* delta: move right and up by the next two bytes */
            if(avail - pos < 2) return;
            x += src[pos];
            y += src[pos + 1];
            pos += 2;
        } else {
            /* absolute run: val literal pixels follow */
            nbytes = nibbles ? (val + 1) / 2 : val;
            if(avail - pos < nbytes) return;
            for(i = 0; i < val; i++, x++) {
                if(nibbles) {
                    idx = src[pos + i / 2];
                    idx = (i & 1) ? (idx & 0xf) : (idx >> 4);
                } else
                    idx = src[pos + i];
                if(x < w) row[x] = bmp_pal(pal, pal_n, nc, idx);
            }
            /* absolute runs are padded to an even number of bytes */
            pos += nbytes + (nbytes & 1);
        }
    }
}

/**
 * Decode a BMP image that is held completely in memory.
 *
 * Supported are Windows and OS/2 headers with 1, 4, 8, 15, 16, 24 or 32 bits
 * per pixel, uncompressed or (for 8 and 4 bits) RLE compressed. All header
 * fields are read byte-wise as little-endian values, and every access to the
 * palette and the pixel data is checked against len.
 *
 * @param buf  start of the BMP file data
 * @param len  number of bytes available at buf
 * @param w    receives the image width in pixels
 * @param h    receives the image height in pixels
 * @return     array of (*w) * (*h) pixels, top row first, each pixel having the
 *             value 0xAABBGGRR (bytes R, G, B, A in memory on little-endian
 *             hosts); the caller releases it with free(). NULL is returned when
 *             an argument is NULL, the header is invalid or unsupported, the
 *             uncompressed pixel data is truncated, or the allocation fails.
 */
uint32_t *bmp_load(uint8_t *buf, uint64_t len, int *w, int *h)
{
    const uint8_t *pal = NULL;
    size_t size, off, pal_off, pal_n = 0, nc, uw, uh;
    uint32_t hdr, comp, bc, *ret;
    uint64_t stride;

    /* 54 bytes = file header (14) + Windows info header (40) */
    if(!buf || len < 54 || !w || !h || buf[0] != 'B' || buf[1] != 'M')
        return NULL;
    size = len > (uint64_t)SIZE_MAX ? SIZE_MAX : (size_t)len;

    hdr = bmp_rd32(buf + 14);
    if(hdr < 40 || bmp_rd32(buf + 30) > 3) {
        /* OS/2 core header: 16-bit dimensions, 3-byte palette entries */
        if(bmp_rd16(buf + 22) > 1) return NULL;
        *w = (int)bmp_rd16(buf + 18);
        *h = (int)bmp_rd16(buf + 20);
        comp = BI_RGB;
        bc = bmp_rd16(buf + 24);
        nc = 3;
        pal_off = 26;
    } else {
        /* Windows info header: 32-bit signed dimensions, 4-byte palette entries */
        if(bmp_rd16(buf + 26) > 1) return NULL;
        *w = (int)(int32_t)bmp_rd32(buf + 18);
        *h = (int)(int32_t)bmp_rd32(buf + 22);
        comp = bmp_rd32(buf + 30);
        bc = bmp_rd16(buf + 28);
        nc = 4;
        /* the palette follows the info header, whatever its size is */
        pal_off = hdr <= size - 14 ? (size_t)hdr + 14 : size;
    }

    if(comp >= BI_BITFIELDS || *w < 1 || *h < 1 ||
        (bc != 1 && bc != 4 && bc != 8 && bc != 15 && bc != 16 && bc != 24 && bc != 32) ||
        (comp == BI_RLE8 && bc != 8) || (comp == BI_RLE4 && bc != 4) ||
        (uint64_t)bmp_rd32(buf + 10) >= (uint64_t)size)
        return NULL;
    off = (size_t)bmp_rd32(buf + 10);
    uw = (size_t)*w;
    uh = (size_t)*h;

    if(bc <= 8 && pal_off < size) {
        pal = buf + pal_off;
        pal_n = (size - pal_off) / nc;  /* only entries that lie inside the buffer */
    }

    /* uncompressed rows are padded to a multiple of four bytes */
    stride = ((uint64_t)uw * (bc == 15 ? 16 : bc) + 31) / 32 * 4;
    if(comp == BI_RGB && stride > (uint64_t)(size - off) / uh)
        return NULL;

    /* reject dimensions whose pixel count or byte size would overflow */
    if(uw > SIZE_MAX / sizeof *ret / uh || !(ret = calloc(uw * uh, sizeof *ret)))
        return NULL;

    if(comp == BI_RGB)
        bmp_decode_rgb(buf + off, (size_t)stride, ret, uw, uh, bc, pal, pal_n, nc);
    else
        bmp_decode_rle(buf + off, size - off, ret, uw, uh, comp == BI_RLE4, pal, pal_n, nc);
    return ret;
}

#endif

#endif
